In [3]:
import pandas as pd
# Load the CSV file with semicolon as delimiter
data = pd.read_csv("Customer_Behavior_Data.csv", delimiter=";")
# Display the first few rows to check the structure
print(data.head())
account length location code user id credit card info save push status \ 0 128 415 3824657 no yes 1 107 415 3717191 no yes 2 137 415 3581921 no no 3 84 408 3759999 yes no 4 75 415 3306626 yes no add to wishlist desktop sessions app sessions desktop transactions \ 0 25 265 45 17 1 26 162 27 17 2 0 243 41 10 3 0 299 51 5 4 0 167 28 13 total product detail views session duration promotion clicks \ 0 110 197 87 1 123 196 103 2 114 121 110 3 71 62 88 4 113 148 122 avg order value sale product views discount rate per visited products \ 0 244,7 91 11,01 1 254,4 103 11,45 2 162,6 104 7,32 3 196,9 89 8,86 4 186,9 121 8,41 product detail view per app session app transactions \ 0 10 3 1 13,7 3 2 12,2 5 3 6,6 7 4 10,1 3 add to cart per session customer service calls churn 0 2,7 1 0 1 3,7 1 0 2 3,29 0 0 3 1,78 2 0 4 2,73 3 0
In [4]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [5]:
data
Out[5]:
| account length | location code | user id | credit card info save | push status | add to wishlist | desktop sessions | app sessions | desktop transactions | total product detail views | session duration | promotion clicks | avg order value | sale product views | discount rate per visited products | product detail view per app session | app transactions | add to cart per session | customer service calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 415 | 3824657 | no | yes | 25 | 265 | 45 | 17 | 110 | 197 | 87 | 244,7 | 91 | 11,01 | 10 | 3 | 2,7 | 1 | 0 |
| 1 | 107 | 415 | 3717191 | no | yes | 26 | 162 | 27 | 17 | 123 | 196 | 103 | 254,4 | 103 | 11,45 | 13,7 | 3 | 3,7 | 1 | 0 |
| 2 | 137 | 415 | 3581921 | no | no | 0 | 243 | 41 | 10 | 114 | 121 | 110 | 162,6 | 104 | 7,32 | 12,2 | 5 | 3,29 | 0 | 0 |
| 3 | 84 | 408 | 3759999 | yes | no | 0 | 299 | 51 | 5 | 71 | 62 | 88 | 196,9 | 89 | 8,86 | 6,6 | 7 | 1,78 | 2 | 0 |
| 4 | 75 | 415 | 3306626 | yes | no | 0 | 167 | 28 | 13 | 113 | 148 | 122 | 186,9 | 121 | 8,41 | 10,1 | 3 | 2,73 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3328 | 192 | 415 | 4144276 | no | yes | 36 | 156 | 27 | 18 | 77 | 216 | 126 | 279,1 | 83 | 12,56 | 9,9 | 6 | 2,67 | 2 | 0 |
| 3329 | 68 | 415 | 3703271 | no | no | 0 | 231 | 39 | 13 | 57 | 153 | 55 | 191,3 | 123 | 8,61 | 9,6 | 4 | 2,59 | 3 | 0 |
| 3330 | 28 | 510 | 3288230 | no | no | 0 | 181 | 31 | 25 | 109 | 289 | 58 | 191,9 | 91 | 8,64 | 14,1 | 6 | 3,81 | 2 | 0 |
| 3331 | 184 | 510 | 3646381 | yes | no | 0 | 214 | 36 | 14 | 105 | 160 | 84 | 139,2 | 137 | 6,26 | 5 | 10 | 1,35 | 2 | 0 |
| 3332 | 74 | 415 | 4004344 | no | yes | 25 | 234 | 40 | 23 | 113 | 266 | 82 | 241,4 | 77 | 10,86 | 13,7 | 4 | 3,7 | 0 | 0 |
3333 rows × 20 columns
In [6]:
# Get data types and check for missing values
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3333 entries, 0 to 3332 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 account length 3333 non-null int64 1 location code 3333 non-null int64 2 user id 3333 non-null int64 3 credit card info save 3333 non-null object 4 push status 3333 non-null object 5 add to wishlist 3333 non-null int64 6 desktop sessions 3333 non-null int64 7 app sessions 3333 non-null int64 8 desktop transactions 3333 non-null int64 9 total product detail views 3333 non-null int64 10 session duration 3333 non-null int64 11 promotion clicks 3333 non-null int64 12 avg order value 3333 non-null object 13 sale product views 3333 non-null int64 14 discount rate per visited products 3333 non-null object 15 product detail view per app session 3333 non-null object 16 app transactions 3333 non-null int64 17 add to cart per session 3333 non-null object 18 customer service calls 3333 non-null int64 19 churn 3333 non-null int64 dtypes: int64(14), object(6) memory usage: 520.9+ KB
In [7]:
# Check for missing values
data.isnull().sum()
Out[7]:
| 0 | |
|---|---|
| account length | 0 |
| location code | 0 |
| user id | 0 |
| credit card info save | 0 |
| push status | 0 |
| add to wishlist | 0 |
| desktop sessions | 0 |
| app sessions | 0 |
| desktop transactions | 0 |
| total product detail views | 0 |
| session duration | 0 |
| promotion clicks | 0 |
| avg order value | 0 |
| sale product views | 0 |
| discount rate per visited products | 0 |
| product detail view per app session | 0 |
| app transactions | 0 |
| add to cart per session | 0 |
| customer service calls | 0 |
| churn | 0 |
In [8]:
# Basic statistics
data.describe().T
Out[8]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| account length | 3333.0 | 1.010648e+02 | 39.822106 | 1.0 | 74.0 | 101.0 | 127.0 | 243.0 |
| location code | 3333.0 | 4.371824e+02 | 42.371290 | 408.0 | 408.0 | 415.0 | 510.0 | 510.0 |
| user id | 3333.0 | 3.746291e+06 | 274662.573752 | 3271058.0 | 3508680.0 | 3748187.0 | 3985970.0 | 4229964.0 |
| add to wishlist | 3333.0 | 8.099010e+00 | 13.688365 | 0.0 | 0.0 | 0.0 | 20.0 | 51.0 |
| desktop sessions | 3333.0 | 1.798119e+02 | 54.457135 | 0.0 | 144.0 | 179.0 | 216.0 | 351.0 |
| app sessions | 3333.0 | 3.056796e+01 | 9.269376 | 0.0 | 24.0 | 31.0 | 37.0 | 60.0 |
| desktop transactions | 3333.0 | 1.708761e+01 | 4.323795 | 0.0 | 14.0 | 17.0 | 20.0 | 31.0 |
| total product detail views | 3333.0 | 1.004356e+02 | 20.069084 | 0.0 | 87.0 | 101.0 | 114.0 | 165.0 |
| session duration | 3333.0 | 2.010396e+02 | 50.714359 | 0.0 | 167.0 | 201.0 | 235.0 | 364.0 |
| promotion clicks | 3333.0 | 1.001107e+02 | 19.923911 | 0.0 | 87.0 | 100.0 | 114.0 | 170.0 |
| sale product views | 3333.0 | 1.001077e+02 | 19.568609 | 33.0 | 87.0 | 100.0 | 113.0 | 175.0 |
| app transactions | 3333.0 | 4.479448e+00 | 2.461214 | 0.0 | 3.0 | 4.0 | 6.0 | 20.0 |
| customer service calls | 3333.0 | 1.562856e+00 | 1.315491 | 0.0 | 1.0 | 1.0 | 2.0 | 9.0 |
| churn | 3333.0 | 1.449145e-01 | 0.352067 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
In [9]:
# Shape of the data
print("Data Shape:", data.shape)
# Unique values per column
unique_values = data.nunique()
unique_values
Data Shape: (3333, 20)
Out[9]:
| 0 | |
|---|---|
| account length | 212 |
| location code | 3 |
| user id | 3333 |
| credit card info save | 2 |
| push status | 2 |
| add to wishlist | 46 |
| desktop sessions | 295 |
| app sessions | 60 |
| desktop transactions | 30 |
| total product detail views | 119 |
| session duration | 287 |
| promotion clicks | 123 |
| avg order value | 1591 |
| sale product views | 120 |
| discount rate per visited products | 933 |
| product detail view per app session | 162 |
| app transactions | 21 |
| add to cart per session | 162 |
| customer service calls | 10 |
| churn | 2 |
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(data['account length'], kde=True)
plt.title('Distribution of Account Length')
plt.xlabel('Account Length (days)')
plt.show()
In [11]:
plt.figure(figsize=(8, 6))
sns.histplot(data['desktop sessions'], color='blue', label='Desktop Sessions', kde=True)
sns.histplot(data['app sessions'], color='orange', label='App Sessions', kde=True)
plt.title('Desktop vs. App Sessions')
plt.xlabel('Number of Sessions')
plt.legend()
plt.show()
In [12]:
plt.figure(figsize=(8, 6))
sns.histplot(data['avg order value'], kde=True, color='green')
plt.title('Distribution of Average Order Value')
plt.xlabel('Average Order Value')
plt.show()
In [13]:
# Count plot for churn status
plt.figure(figsize=(8, 6))
sns.countplot(x='churn', data=data, palette='viridis')
plt.title('Churn Status')
plt.xlabel('Churn')
plt.ylabel('Number of Customers')
plt.show()
<ipython-input-13-1e512c34adce>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='churn', data=data, palette='viridis')
In [14]:
# Count plots for each categorical variable
import matplotlib.pyplot as plt
import seaborn as sns
categorical_cols = ['credit card info save', 'push status'] # Replace with actual categorical column names
for col in categorical_cols:
plt.figure(figsize=(8, 5))
sns.countplot(data=data, x=col, palette="viridis")
plt.title(f"Count of {col}")
plt.show()
<ipython-input-14-c5a6e8b1d039>:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=data, x=col, palette="viridis")
<ipython-input-14-c5a6e8b1d039>:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=data, x=col, palette="viridis")
In [15]:
# Pair plot for key numerical relationships, separated by churn status
sns.pairplot(data, hue='churn', palette="husl", diag_kind="kde") # 'churn' is the target column
plt.suptitle("Pairwise Plot by Churn Status", y=1.02)
plt.show()
In [16]:
# Violin plot to compare distributions of app and desktop metrics by churn status
app_metrics = ['app sessions', 'app transactions']
desktop_metrics = ['desktop sessions', 'desktop transactions']
for col in app_metrics + desktop_metrics:
plt.figure(figsize=(8, 5))
sns.violinplot(data=data, x='churn', y=col, palette="muted")
plt.title(f"{col} Distribution by Churn Status")
plt.show()
<ipython-input-16-00ccb04ca475>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(data=data, x='churn', y=col, palette="muted")
<ipython-input-16-00ccb04ca475>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(data=data, x='churn', y=col, palette="muted")
<ipython-input-16-00ccb04ca475>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(data=data, x='churn', y=col, palette="muted")
<ipython-input-16-00ccb04ca475>:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(data=data, x='churn', y=col, palette="muted")
In [17]:
import numpy as np
# Plotting the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(data.select_dtypes(include=[np.number]).corr(), annot=True, cmap="coolwarm", fmt=".3f")
plt.title("Correlation Matrix of Numerical Columns")
plt.show()
In [18]:
summary_stats = data.describe().T
summary_stats['skew'] = data.select_dtypes(include=[np.number]).skew() # Skewness shows asymmetry of distribution
summary_stats
Out[18]:
| count | mean | std | min | 25% | 50% | 75% | max | skew | |
|---|---|---|---|---|---|---|---|---|---|
| account length | 3333.0 | 1.010648e+02 | 39.822106 | 1.0 | 74.0 | 101.0 | 127.0 | 243.0 | 0.096606 |
| location code | 3333.0 | 4.371824e+02 | 42.371290 | 408.0 | 408.0 | 415.0 | 510.0 | 510.0 | 1.126823 |
| user id | 3333.0 | 3.746291e+06 | 274662.573752 | 3271058.0 | 3508680.0 | 3748187.0 | 3985970.0 | 4229964.0 | 0.009732 |
| add to wishlist | 3333.0 | 8.099010e+00 | 13.688365 | 0.0 | 0.0 | 0.0 | 20.0 | 51.0 | 1.264824 |
| desktop sessions | 3333.0 | 1.798119e+02 | 54.457135 | 0.0 | 144.0 | 179.0 | 216.0 | 351.0 | -0.028737 |
| app sessions | 3333.0 | 3.056796e+01 | 9.269376 | 0.0 | 24.0 | 31.0 | 37.0 | 60.0 | -0.028420 |
| desktop transactions | 3333.0 | 1.708761e+01 | 4.323795 | 0.0 | 14.0 | 17.0 | 20.0 | 31.0 | -0.010819 |
| total product detail views | 3333.0 | 1.004356e+02 | 20.069084 | 0.0 | 87.0 | 101.0 | 114.0 | 165.0 | -0.111787 |
| session duration | 3333.0 | 2.010396e+02 | 50.714359 | 0.0 | 167.0 | 201.0 | 235.0 | 364.0 | -0.024248 |
| promotion clicks | 3333.0 | 1.001107e+02 | 19.923911 | 0.0 | 87.0 | 100.0 | 114.0 | 170.0 | -0.055096 |
| sale product views | 3333.0 | 1.001077e+02 | 19.568609 | 33.0 | 87.0 | 100.0 | 113.0 | 175.0 | 0.032500 |
| app transactions | 3333.0 | 4.479448e+00 | 2.461214 | 0.0 | 3.0 | 4.0 | 6.0 | 20.0 | 1.321478 |
| customer service calls | 3333.0 | 1.562856e+00 | 1.315491 | 0.0 | 1.0 | 1.0 | 2.0 | 9.0 | 1.091359 |
| churn | 3333.0 | 1.449145e-01 | 0.352067 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.018356 |
In [19]:
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
# Apply LabelEncoder on 'credit card info save' and 'push status'
data['credit card info save'] = label_encoder.fit_transform(data['credit card info save'])
data['push status'] = label_encoder.fit_transform(data['push status'])
# Display the first few rows to confirm the encoding
data.head()
Out[19]:
| account length | location code | user id | credit card info save | push status | add to wishlist | desktop sessions | app sessions | desktop transactions | total product detail views | session duration | promotion clicks | avg order value | sale product views | discount rate per visited products | product detail view per app session | app transactions | add to cart per session | customer service calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 415 | 3824657 | 0 | 1 | 25 | 265 | 45 | 17 | 110 | 197 | 87 | 244,7 | 91 | 11,01 | 10 | 3 | 2,7 | 1 | 0 |
| 1 | 107 | 415 | 3717191 | 0 | 1 | 26 | 162 | 27 | 17 | 123 | 196 | 103 | 254,4 | 103 | 11,45 | 13,7 | 3 | 3,7 | 1 | 0 |
| 2 | 137 | 415 | 3581921 | 0 | 0 | 0 | 243 | 41 | 10 | 114 | 121 | 110 | 162,6 | 104 | 7,32 | 12,2 | 5 | 3,29 | 0 | 0 |
| 3 | 84 | 408 | 3759999 | 1 | 0 | 0 | 299 | 51 | 5 | 71 | 62 | 88 | 196,9 | 89 | 8,86 | 6,6 | 7 | 1,78 | 2 | 0 |
| 4 | 75 | 415 | 3306626 | 1 | 0 | 0 | 167 | 28 | 13 | 113 | 148 | 122 | 186,9 | 121 | 8,41 | 10,1 | 3 | 2,73 | 3 | 0 |
In [20]:
# Box plot for key activity metrics
columns_to_plot = ['desktop sessions', 'app sessions', 'desktop transactions',
'app transactions', 'session duration', 'promotion clicks']
for column in columns_to_plot:
plt.figure(figsize=(8, 4))
sns.boxplot(data[column])
plt.title(f'Box Plot of {column}')
plt.show()
In [21]:
# Scatter plot of session duration vs. desktop transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='session duration', y='desktop transactions', hue='churn')
plt.title('Session Duration vs. Desktop Transactions')
plt.xlabel('Session Duration')
plt.ylabel('Desktop Transactions')
plt.show()
In [22]:
# Pair plot of selected features
sns.pairplot(data[['desktop sessions', 'app sessions', 'promotion clicks', 'session duration', 'churn']], hue='churn')
plt.suptitle('Pair Plot of Selected Features (Desktop, App Sessions, Promotions, Churn)', y=1.02)
plt.show()
In [23]:
# Bar plot of average transactions by churn status
plt.figure(figsize=(8, 5))
sns.barplot(data=data, x='churn', y='desktop transactions', estimator='mean', ci=None)
plt.title('Average Desktop Transactions by Churn Status')
plt.xlabel('Churn')
plt.ylabel('Average Desktop Transactions')
plt.show()
<ipython-input-23-016eed8ca896>:3: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=data, x='churn', y='desktop transactions', estimator='mean', ci=None)
In [24]:
# Scatter plot for session duration vs. desktop and app transactions
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='session duration', y='desktop transactions', hue='churn')
plt.title('Relationship between Session Duration and Desktop Transactions')
plt.xlabel('Session Duration (minutes)')
plt.ylabel('Desktop Transactions')
plt.show()
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='session duration', y='app transactions', hue='churn')
plt.title('Relationship between Session Duration and App Transactions')
plt.xlabel('Session Duration (minutes)')
plt.ylabel('App Transactions')
plt.show()
In [25]:
# Scatter plot for wishlist additions vs. transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='add to wishlist', y='desktop transactions', hue='churn')
plt.title('Wishlist Additions vs. Desktop Transactions')
plt.xlabel('Add to Wishlist Count')
plt.ylabel('Desktop Transactions')
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='add to wishlist', y='app transactions', hue='churn')
plt.title('Wishlist Additions vs. App Transactions')
plt.xlabel('Add to Wishlist Count')
plt.ylabel('App Transactions')
plt.show()
In [26]:
# Scatter plot for promotion clicks vs. desktop and app transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='promotion clicks', y='desktop transactions', hue='churn')
plt.title('Promotion Clicks vs. Desktop Transactions')
plt.xlabel('Promotion Clicks')
plt.ylabel('Desktop Transactions')
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='promotion clicks', y='app transactions', hue='churn')
plt.title('Promotion Clicks vs. App Transactions')
plt.xlabel('Promotion Clicks')
plt.ylabel('App Transactions')
plt.show()
In [27]:
# Box plot of customer service calls by churn status
plt.figure(figsize=(8, 5))
sns.boxplot(data=data, x='churn', y='customer service calls')
plt.title('Customer Service Calls by Churn Status')
plt.xlabel('Churn Status')
plt.ylabel('Customer Service Calls')
plt.show()
In [31]:
data
Out[31]:
| account length | location code | user id | credit card info save | push status | add to wishlist | desktop sessions | app sessions | desktop transactions | total product detail views | session duration | promotion clicks | avg order value | sale product views | discount rate per visited products | product detail view per app session | app transactions | add to cart per session | customer service calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 415 | 3824657 | 0 | 1 | 25 | 265 | 45 | 17 | 110 | 197 | 87 | 244,7 | 91 | 11,01 | 10 | 3 | 2,7 | 1 | 0 |
| 1 | 107 | 415 | 3717191 | 0 | 1 | 26 | 162 | 27 | 17 | 123 | 196 | 103 | 254,4 | 103 | 11,45 | 13,7 | 3 | 3,7 | 1 | 0 |
| 2 | 137 | 415 | 3581921 | 0 | 0 | 0 | 243 | 41 | 10 | 114 | 121 | 110 | 162,6 | 104 | 7,32 | 12,2 | 5 | 3,29 | 0 | 0 |
| 3 | 84 | 408 | 3759999 | 1 | 0 | 0 | 299 | 51 | 5 | 71 | 62 | 88 | 196,9 | 89 | 8,86 | 6,6 | 7 | 1,78 | 2 | 0 |
| 4 | 75 | 415 | 3306626 | 1 | 0 | 0 | 167 | 28 | 13 | 113 | 148 | 122 | 186,9 | 121 | 8,41 | 10,1 | 3 | 2,73 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3328 | 192 | 415 | 4144276 | 0 | 1 | 36 | 156 | 27 | 18 | 77 | 216 | 126 | 279,1 | 83 | 12,56 | 9,9 | 6 | 2,67 | 2 | 0 |
| 3329 | 68 | 415 | 3703271 | 0 | 0 | 0 | 231 | 39 | 13 | 57 | 153 | 55 | 191,3 | 123 | 8,61 | 9,6 | 4 | 2,59 | 3 | 0 |
| 3330 | 28 | 510 | 3288230 | 0 | 0 | 0 | 181 | 31 | 25 | 109 | 289 | 58 | 191,9 | 91 | 8,64 | 14,1 | 6 | 3,81 | 2 | 0 |
| 3331 | 184 | 510 | 3646381 | 1 | 0 | 0 | 214 | 36 | 14 | 105 | 160 | 84 | 139,2 | 137 | 6,26 | 5 | 10 | 1,35 | 2 | 0 |
| 3332 | 74 | 415 | 4004344 | 0 | 1 | 25 | 234 | 40 | 23 | 113 | 266 | 82 | 241,4 | 77 | 10,86 | 13,7 | 4 | 3,7 | 0 | 0 |
3333 rows × 20 columns
In [32]:
# Convert commas to periods
converted_data = data["discount rate per visited products"].replace(',', '.')
# Print the result
print(converted_data)
0 11,01
1 11,45
2 7,32
3 8,86
4 8,41
...
3328 12,56
3329 8,61
3330 8,64
3331 6,26
3332 10,86
Name: discount rate per visited products, Length: 3333, dtype: object
In [32]:
In [33]:
data
Out[33]:
| account length | location code | user id | credit card info save | push status | add to wishlist | desktop sessions | app sessions | desktop transactions | total product detail views | session duration | promotion clicks | avg order value | sale product views | discount rate per visited products | product detail view per app session | app transactions | add to cart per session | customer service calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 415 | 3824657 | 0 | 1 | 25 | 265 | 45 | 17 | 110 | 197 | 87 | 244,7 | 91 | 11,01 | 10 | 3 | 2,7 | 1 | 0 |
| 1 | 107 | 415 | 3717191 | 0 | 1 | 26 | 162 | 27 | 17 | 123 | 196 | 103 | 254,4 | 103 | 11,45 | 13,7 | 3 | 3,7 | 1 | 0 |
| 2 | 137 | 415 | 3581921 | 0 | 0 | 0 | 243 | 41 | 10 | 114 | 121 | 110 | 162,6 | 104 | 7,32 | 12,2 | 5 | 3,29 | 0 | 0 |
| 3 | 84 | 408 | 3759999 | 1 | 0 | 0 | 299 | 51 | 5 | 71 | 62 | 88 | 196,9 | 89 | 8,86 | 6,6 | 7 | 1,78 | 2 | 0 |
| 4 | 75 | 415 | 3306626 | 1 | 0 | 0 | 167 | 28 | 13 | 113 | 148 | 122 | 186,9 | 121 | 8,41 | 10,1 | 3 | 2,73 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3328 | 192 | 415 | 4144276 | 0 | 1 | 36 | 156 | 27 | 18 | 77 | 216 | 126 | 279,1 | 83 | 12,56 | 9,9 | 6 | 2,67 | 2 | 0 |
| 3329 | 68 | 415 | 3703271 | 0 | 0 | 0 | 231 | 39 | 13 | 57 | 153 | 55 | 191,3 | 123 | 8,61 | 9,6 | 4 | 2,59 | 3 | 0 |
| 3330 | 28 | 510 | 3288230 | 0 | 0 | 0 | 181 | 31 | 25 | 109 | 289 | 58 | 191,9 | 91 | 8,64 | 14,1 | 6 | 3,81 | 2 | 0 |
| 3331 | 184 | 510 | 3646381 | 1 | 0 | 0 | 214 | 36 | 14 | 105 | 160 | 84 | 139,2 | 137 | 6,26 | 5 | 10 | 1,35 | 2 | 0 |
| 3332 | 74 | 415 | 4004344 | 0 | 1 | 25 | 234 | 40 | 23 | 113 | 266 | 82 | 241,4 | 77 | 10,86 | 13,7 | 4 | 3,7 | 0 | 0 |
3333 rows × 20 columns
In [34]:
data.to_csv('data_new.csv', index=False)
In [35]:
# Remove all commas from the data
df = data.applymap(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
<ipython-input-35-5210660e964e>:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
df = data.applymap(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
In [36]:
df
Out[36]:
| account length | location code | user id | credit card info save | push status | add to wishlist | desktop sessions | app sessions | desktop transactions | total product detail views | session duration | promotion clicks | avg order value | sale product views | discount rate per visited products | product detail view per app session | app transactions | add to cart per session | customer service calls | churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 415 | 3824657 | 0 | 1 | 25 | 265 | 45 | 17 | 110 | 197 | 87 | 2447 | 91 | 1101 | 10 | 3 | 27 | 1 | 0 |
| 1 | 107 | 415 | 3717191 | 0 | 1 | 26 | 162 | 27 | 17 | 123 | 196 | 103 | 2544 | 103 | 1145 | 137 | 3 | 37 | 1 | 0 |
| 2 | 137 | 415 | 3581921 | 0 | 0 | 0 | 243 | 41 | 10 | 114 | 121 | 110 | 1626 | 104 | 732 | 122 | 5 | 329 | 0 | 0 |
| 3 | 84 | 408 | 3759999 | 1 | 0 | 0 | 299 | 51 | 5 | 71 | 62 | 88 | 1969 | 89 | 886 | 66 | 7 | 178 | 2 | 0 |
| 4 | 75 | 415 | 3306626 | 1 | 0 | 0 | 167 | 28 | 13 | 113 | 148 | 122 | 1869 | 121 | 841 | 101 | 3 | 273 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3328 | 192 | 415 | 4144276 | 0 | 1 | 36 | 156 | 27 | 18 | 77 | 216 | 126 | 2791 | 83 | 1256 | 99 | 6 | 267 | 2 | 0 |
| 3329 | 68 | 415 | 3703271 | 0 | 0 | 0 | 231 | 39 | 13 | 57 | 153 | 55 | 1913 | 123 | 861 | 96 | 4 | 259 | 3 | 0 |
| 3330 | 28 | 510 | 3288230 | 0 | 0 | 0 | 181 | 31 | 25 | 109 | 289 | 58 | 1919 | 91 | 864 | 141 | 6 | 381 | 2 | 0 |
| 3331 | 184 | 510 | 3646381 | 1 | 0 | 0 | 214 | 36 | 14 | 105 | 160 | 84 | 1392 | 137 | 626 | 5 | 10 | 135 | 2 | 0 |
| 3332 | 74 | 415 | 4004344 | 0 | 1 | 25 | 234 | 40 | 23 | 113 | 266 | 82 | 2414 | 77 | 1086 | 137 | 4 | 37 | 0 | 0 |
3333 rows × 20 columns
In [38]:
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3333 entries, 0 to 3332 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 account length 3333 non-null int64 1 location code 3333 non-null int64 2 user id 3333 non-null int64 3 credit card info save 3333 non-null int64 4 push status 3333 non-null int64 5 add to wishlist 3333 non-null int64 6 desktop sessions 3333 non-null int64 7 app sessions 3333 non-null int64 8 desktop transactions 3333 non-null int64 9 total product detail views 3333 non-null int64 10 session duration 3333 non-null int64 11 promotion clicks 3333 non-null int64 12 avg order value 3333 non-null object 13 sale product views 3333 non-null int64 14 discount rate per visited products 3333 non-null object 15 product detail view per app session 3333 non-null object 16 app transactions 3333 non-null int64 17 add to cart per session 3333 non-null object 18 customer service calls 3333 non-null int64 19 churn 3333 non-null int64 dtypes: int64(16), object(4) memory usage: 520.9+ KB
In [39]:
# Convert columns with numeric strings to integers
for col in df.columns:
if df[col].dtype == 'object':
df[col] = pd.to_numeric(df[col], errors='coerce')
In [40]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3333 entries, 0 to 3332 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 account length 3333 non-null int64 1 location code 3333 non-null int64 2 user id 3333 non-null int64 3 credit card info save 3333 non-null int64 4 push status 3333 non-null int64 5 add to wishlist 3333 non-null int64 6 desktop sessions 3333 non-null int64 7 app sessions 3333 non-null int64 8 desktop transactions 3333 non-null int64 9 total product detail views 3333 non-null int64 10 session duration 3333 non-null int64 11 promotion clicks 3333 non-null int64 12 avg order value 3333 non-null int64 13 sale product views 3333 non-null int64 14 discount rate per visited products 3333 non-null int64 15 product detail view per app session 3333 non-null int64 16 app transactions 3333 non-null int64 17 add to cart per session 3333 non-null int64 18 customer service calls 3333 non-null int64 19 churn 3333 non-null int64 dtypes: int64(20) memory usage: 520.9 KB
In [41]:
# Define features and target variable
X = df.drop(columns=['churn', 'user id']) # Drop target and ID columns
y = df['churn']
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
Out[41]:
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [42]:
# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 1.00 0.97 857
1 0.96 0.68 0.80 143
accuracy 0.95 1000
macro avg 0.95 0.84 0.88 1000
weighted avg 0.95 0.95 0.95 1000
In [ ]: